This notebook pulls in data on Nipah RBP entry into CHO-EFNB2 and CHO-EFNB3 cells, filters data, calculates stats, and makes figures¶

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
func_scores_E2_file = None
func_scores_E3_file = None

filtered_E2_data = None
filtered_E3_data = None

E2_entry_heatmap = None
E3_entry_heatmap = None

nipah_config = None
altair_config = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
filtered_E2_data = "results/filtered_data/E2_entry_filtered.csv"
filtered_E3_data = "results/filtered_data/E3_entry_filtered.csv"
E2_entry_heatmap = "results/images/E2_entry_heatmap.html"
E3_entry_heatmap = "results/images/E3_entry_heatmap.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
In [3]:
import math
import os
import re
import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO

import yaml

from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory

Read in custom altair theme¶

In [5]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

Import YAML file with parameters¶

read in config data with filtering parameters

In [6]:
with open(nipah_config) as f:
    config = yaml.safe_load(f)

Filter and merge EFNB2 and EFNB3 entry¶

In [7]:
# Import data
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)

#func_scores_E2 = pd.read_csv('results/func_effects/averages/CHO_EFNB2_low_func_effects.csv')
#func_scores_E3 = pd.read_csv('results/func_effects/averages/CHO_EFNB3_low_func_effects.csv')
def num_selections_and_filter(df,name):
    df = df[df['mutant'] != '-']
    
    #Make a couple plots before filtering
    def plot_mean_cell_entry_mutant(df):
        mean_df = df.groupby('mutant')['effect'].median().reset_index()
        chart = alt.Chart(mean_df).mark_circle(size=100,color='black',opacity=1).encode(
            x=alt.X('mutant:O',axis=alt.Axis(grid=True),sort=alt.EncodingSortField(field='effect', order='descending'),title='Mutant Amino Acid'),
            y=alt.Y('effect',title=f'Median Cell Entry for {name}',axis=alt.Axis(grid=True,tickCount=4)),
        ).properties(
            height=alt.Step(10),
            width=alt.Step(10)
        )
        return chart.display()
    plot_mean_cell_entry_mutant(df)

    def plot_times_seen_std(df):
        chart = alt.Chart(df).mark_circle(size=20,color='black',opacity=0.2).encode(
            x=alt.X('times_seen',axis=alt.Axis(grid=True),title=f'Times Seen for {name}',scale=alt.Scale(type='log')),
            y=alt.Y('effect_std',title='Effect Std',axis=alt.Axis(grid=True,tickCount=4)),
            tooltip=['site','times_seen','effect_std','effect']
        ).properties(
            height=alt.Step(10),
            width=alt.Step(10)
        )
        return chart.display()
    plot_times_seen_std(df)
    
    max_sels = df['n_selections'].max()
    num_sels_cutoff = (max_sels/2) + 1
    print(f'The number of selections a mutant must be observed in is: {num_sels_cutoff}')

    total_mut = (602-71) * 20

    filter_test = df[
            (df['site'] != 603) &
            (df['mutant'] != '-') &
            (df['mutant'] != '*') 
        ]
    num_variants_pre_filter = filter_test.shape[0]
    print(f'Before filtering there are {num_variants_pre_filter} mutants which is {(num_variants_pre_filter/total_mut) * 100:.1f}%')
    
    filter_test_times_seen = filter_test[filter_test['times_seen'] >= config['func_times_seen_cutoff']]
    num_variants_times_seen = filter_test_times_seen.shape[0]
    print(f"After filtering for {config['func_times_seen_cutoff']} times seen, there are {num_variants_times_seen}, which is {(num_variants_times_seen/total_mut) * 100:.1f}%")

    filter_test_effect_std = filter_test_times_seen[filter_test_times_seen['effect_std'] <= config['func_std_cutoff']]
    num_variants_std = filter_test_effect_std.shape[0]
    print(f"After filtering for {config['func_std_cutoff']} std cutoff, there are {num_variants_std}, which is {(num_variants_std/total_mut) * 100 :.1f}%")

    filter_test_n_selections = filter_test_effect_std[filter_test_effect_std['n_selections'] >= num_sels_cutoff]
    num_variants_n_selections = filter_test_n_selections.shape[0]
    print(f'After filtering for mutants in in all selections, there are {num_variants_n_selections}, which is {(num_variants_n_selections/total_mut) * 100 :.1f}%')

    for times_seen in [2.5,3,3.5,4]:
        filter_test = filter_test[filter_test['times_seen'] >= times_seen]
        print(f'For {times_seen} times seen cutoff, there are {filter_test.shape[0]} mutations, which is {(filter_test.shape[0]/total_mut) * 100 :.1f}%')

    
    max_sels = df['n_selections'].max()
    num_sels_cutoff = (max_sels/2) + 1
    df = df[
        (df['site'] != 603) &
        (df['mutant'] != '-') &
        (df['mutant'] != '*') &
        (df['times_seen'] >= config['func_times_seen_cutoff']) &
        (df['effect_std'] <= config['func_std_cutoff']) &
        (df['n_selections'] >= num_sels_cutoff)
    ]
    plot_mean_cell_entry_mutant(df)
    
    #Now write filtered_data for mapping
    if name == 'E2':
        df.to_csv(filtered_E2_data,index=False)
    if name == 'E3':
        df.to_csv(filtered_E3_data,index=False)
    return df



func_scores_E2 = num_selections_and_filter(func_scores_E2,'E2')
func_scores_E3 = num_selections_and_filter(func_scores_E3,'E3')

merged_df = pd.merge(
    func_scores_E2,
    func_scores_E3,
    on=['site','mutant','wildtype'],
    how='outer',
    suffixes=['_E2','_E3']
)

display(merged_df.describe())
The number of selections a mutant must be observed in is: 5.0
Before filtering there are 10607 mutants which is 99.9%
After filtering for 2 times seen, there are 9811, which is 92.4%
After filtering for 1 std cutoff, there are 9729, which is 91.6%
After filtering for mutants in in all selections, there are 9725, which is 91.6%
For 2.5 times seen cutoff, there are 9545 mutations, which is 89.9%
For 3 times seen cutoff, there are 9259 mutations, which is 87.2%
For 3.5 times seen cutoff, there are 8806 mutations, which is 82.9%
For 4 times seen cutoff, there are 8315 mutations, which is 78.3%
The number of selections a mutant must be observed in is: 4.5
Before filtering there are 10607 mutants which is 99.9%
After filtering for 2 times seen, there are 9852, which is 92.8%
After filtering for 1 std cutoff, there are 9714, which is 91.5%
After filtering for mutants in in all selections, there are 9586, which is 90.3%
For 2.5 times seen cutoff, there are 9535 mutations, which is 89.8%
For 3 times seen cutoff, there are 9168 mutations, which is 86.3%
For 3.5 times seen cutoff, there are 8411 mutations, which is 79.2%
For 4 times seen cutoff, there are 7775 mutations, which is 73.2%
site effect_E2 effect_std_E2 times_seen_E2 n_selections_E2 effect_E3 effect_std_E3 times_seen_E3 n_selections_E3
count 9885.000000 9725.000000 9725.000000 9725.000000 9725.000000 9586.000000 9586.000000 9586.000000 9586.000000
mean 337.222964 -0.918767 0.322007 7.196755 7.987249 -0.952189 0.344646 6.142469 6.984769
std 153.228045 1.402207 0.235383 4.076880 0.115810 1.458882 0.246157 3.272205 0.139967
min 71.000000 -3.518000 0.000000 2.000000 5.000000 -3.608000 0.000000 2.000000 5.000000
25% 205.000000 -2.214000 0.147400 4.625000 8.000000 -2.501250 0.173000 4.143000 7.000000
50% 338.000000 -0.266600 0.301400 6.375000 8.000000 -0.222150 0.328000 5.429000 7.000000
75% 470.000000 0.219000 0.466300 8.500000 8.000000 0.223875 0.494650 7.143000 7.000000
max 602.000000 0.640400 1.000000 64.380000 8.000000 0.660100 1.000000 49.140000 7.000000

Stats¶

In [8]:
def calculate_stats(df,name):
    #df = df[df['times_seen'] > cutoff]
    total_mut = (602-71) * 20
    print(total_mut)
    muts_present = df['effect'].shape[0]
    fraction_muts = muts_present / total_mut
    print(f'fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}')

    deleterious_muts = df[df['effect'] <= -0.25].shape[0]
    neutral_muts = df[(df['effect'] > -0.25) & (df['effect'] < 0.25)].shape[0]
    positive_muts = df[df['effect'] > 0.25].shape[0]

    frac_bad_muts = deleterious_muts / muts_present
    frac_neutral_muts = neutral_muts / muts_present
    frac_pos_muts = positive_muts / muts_present
    print(f'The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}')
    print(f'The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}')
    print(f'The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}')

calculate_stats(func_scores_E2,'E2')
calculate_stats(func_scores_E3,'E3')
10620
fraction muts present in E2 is 0.92 9725/10620
The number of deleterious mutants for E2 is 0.50 4906/9725
The number of neutral mutants for E2 is 0.26 2576/9725
The number of positive mutants for E2 is 0.23 2243/9725
10620
fraction muts present in E3 is 0.90 9586/10620
The number of deleterious mutants for E3 is 0.49 4719/9586
The number of neutral mutants for E3 is 0.28 2687/9586
The number of positive mutants for E3 is 0.23 2180/9586
In [9]:
def effect_histogram(df):
    base = alt.Chart(df).mark_bar(opacity=0.5).encode(
        alt.X(alt.repeat("column"), type='quantitative', bin=alt.Bin(maxbins=10)),  
        alt.Y('count()', stack=None),  
        color=alt.Color('Experiment:N')
    ).properties(
        width=300,
        height=alt.Step(10)
    )
    # Create the layered histogram
    histogram = alt.layer(
        base.encode(alt.X('effect_E2:Q', bin=alt.Bin(maxbins=100),axis=alt.Axis(values=[1, 0, -1, -2, -3, -4]),scale=alt.Scale(domain=[-4,1])), color=alt.value('#1f4e79')),
        base.encode(alt.X('effect_E3:Q', bin=alt.Bin(maxbins=100),axis=alt.Axis(values=[1, 0, -1, -2, -3, -4]),scale=alt.Scale(domain=[-4,1])), color=alt.value('#ff7f0e'))
    )
    return histogram.display()
    
effect_histogram(merged_df)
In [10]:
def overall_stats_median(df,effect):
    mean_df = df.groupby('site')[[effect]].max().reset_index()
    total_sites = len(mean_df['site'].unique().tolist())
    mean_df = mean_df[mean_df[effect] < 0]
    subset = len(mean_df['site'].unique().tolist())
    fraction = subset/total_sites
    percent = fraction * 100
    print(f'The number of sites with median entry score less than 0 for {effect} is : {subset}')
    print(f'The percent of sites with median entry score less than 0 for {effect} is : {percent:.0f}')

overall_stats_median(func_scores_E2,'effect')
overall_stats_median(func_scores_E3,'effect')
The number of sites with median entry score less than 0 for effect is : 70
The percent of sites with median entry score less than 0 for effect is : 13
The number of sites with median entry score less than 0 for effect is : 78
The percent of sites with median entry score less than 0 for effect is : 15

How many sites and which sites only have negative entry scores for mutations?¶

In [11]:
def overall_stats_all_neg(df,effect):
    filtered_df = df.groupby('site').filter(lambda group: (group[effect] < 0).all())
    unique = filtered_df['site'].unique()
    print(list(unique))
    total_sites = df['site'].unique().shape[0]
    subset = filtered_df['site'].unique().shape[0]
       
    fraction = subset/total_sites
    percent = fraction * 100
    print(f'The total number of sites are: {total_sites}')
    print(f' The number of sites where all mutants are negative for {effect}: {subset}')
    print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}')
    return unique

intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect'))
[80, 106, 107, 108, 111, 112, 113, 120, 121, 125, 126, 127, 130, 138, 146, 151, 157, 158, 159, 162, 163, 165, 167, 172, 189, 203, 205, 206, 207, 208, 216, 229, 240, 246, 251, 253, 254, 257, 258, 259, 260, 262, 263, 264, 266, 267, 303, 323, 331, 347, 355, 382, 387, 395, 412, 460, 467, 487, 489, 493, 499, 500, 503, 506, 537, 563, 565, 574, 594, 598]
The total number of sites are: 532
 The number of sites where all mutants are negative for effect: 70
 The percent of sites where all mutants are negative for effect: 13
[95, 100, 106, 107, 108, 111, 112, 113, 121, 125, 126, 138, 146, 158, 162, 163, 201, 203, 206, 207, 216, 229, 240, 243, 248, 251, 253, 257, 258, 259, 260, 263, 266, 303, 347, 352, 355, 368, 382, 387, 389, 395, 412, 439, 458, 460, 467, 486, 487, 489, 493, 494, 497, 499, 500, 501, 503, 504, 505, 506, 510, 526, 531, 532, 533, 537, 556, 557, 563, 565, 573, 574, 579, 581, 584, 585, 588, 594]
The total number of sites are: 532
 The number of sites where all mutants are negative for effect: 78
 The percent of sites where all mutants are negative for effect: 15
In [12]:
def calculate_top_func_scores(df,effect):
    percentile_95_effect_E2 = df[effect].quantile(0.999)
    cutoff_E2_df_sites = df[df[effect] > percentile_95_effect_E2]
    E2_site_cutoff = cutoff_E2_df_sites['site'].unique()
    print(f'The sites with the highest functional scores for {effect} are: {list(E2_site_cutoff)}')

calculate_top_func_scores(func_scores_E2,'effect')
calculate_top_func_scores(func_scores_E3,'effect')
The sites with the highest functional scores for effect are: [280, 305, 407, 463, 468, 501, 552, 556, 584, 599]
The sites with the highest functional scores for effect are: [183, 315, 337, 358, 378, 393, 418, 419, 554, 597]

Make bubble plots of receptor contact site type (median values per site)¶

In [13]:
def make_boxplot_entry_region(df,effect):# Create a box plot using Altair for aggregated means
    barrel_ranges = {
    'Hydrophobic': config['hydrophobic'],
    'Salt Bridges': config['salt_bridges'],
    'Hydrogen Bonds': config['h_bond_total'],
    'Contact': config['contact_sites'],
    'Overall': list(range(71,602)),
    }    
    mean_df = df.groupby('site')[['effect']].median().reset_index()
    custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
    
    agg_means = []
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['site'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
        agg_means_df = pd.DataFrame(agg_means)
    if effect == 'E2':
        effect_name = 'EFNB2'
    else:
        effect_name = 'EFNB3'    
    chart = alt.Chart(agg_means_df).mark_point(filled=True,size=50,opacity=0.7).encode(
                x=alt.X('barrel:O',sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
                y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
                xOffset='random:Q',
                color = alt.Color('barrel').legend(None),
                tooltip=['barrel', 'effect','site'],
            ).transform_calculate(
                random="sqrt(-1*log(random()))*cos(2*PI*random())"
        
            ).properties(
                height=300,
                width=alt.Step(20)
            )
    
    return chart.display()
make_boxplot_entry_region(func_scores_E2,'E2')
make_boxplot_entry_region(func_scores_E3,'E3')

Make bubble plots depending on amino acid property¶

In [14]:
def make_boxplot_entry_region(df,effect):
    barrel_ranges = {
    'hydrophobic_AA': list(['A','V','L','I','M']),
    'aromatic_AA': list(['Y','W','F']),
    'positive_AA': list(['K','R','H']),
    'negative_AA': list(['E','D']),
    'hydrophilic_AA': list(['S','T','N','Q']),
    'special_AA': list(['C','P','G']),
    }

    mean_df = df.groupby(['site','wildtype'])['effect'].median().reset_index()
    if effect == 'E2':
        effect_name = 'EFNB2'
    else:
        effect_name = 'EFNB3'
    
    agg_means = []
    
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['wildtype'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
    agg_means_df = pd.DataFrame(agg_means)
        
    chart = alt.Chart(agg_means_df).mark_point(filled=True).encode(
                x=alt.X('barrel:O',title=None,axis=alt.Axis(labelAngle=-90)), #sort=custom_order
                y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
                xOffset='random:Q',
                color = alt.Color('barrel').legend(None),
                tooltip=['barrel', 'effect','site'],
            ).transform_calculate(
                random="sqrt(-1*log(random()))*cos(2*PI*random())"
        
            ).properties(
                height=alt.Step(50),
                width=alt.Step(50)
    )        
    return chart.display()

make_boxplot_entry_region(func_scores_E2,'E2')
make_boxplot_entry_region(func_scores_E3,'E3')

Make boxplot showing median entry by RBP region¶

In [15]:
def make_boxplot_entry_region(df,effect):
    barrel_ranges = {
    'Stalk': list(range(96, 147)),
    'Neck': list(range(148,165)),
    'Linker': list(range(166,177)),
    'Head': list(range(178,602)),
    'Total': list(range(71,602)),
    }
    custom_order=['Stalk','Neck','Linker','Head','Total']
    mean_df = df.groupby('site')['effect'].median().reset_index()
    
    if effect == 'E2':
        effect_name = 'EFNB2'
    else:
        effect_name = 'EFNB3'
    agg_means = []
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['site'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
        agg_means_df = pd.DataFrame(agg_means)
        
    chart = alt.Chart(agg_means_df).mark_boxplot(extent='min-max',opacity=1).encode(
                x=alt.X('barrel:O',sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
                y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
                color = alt.Color('barrel').legend(None),
                tooltip=['barrel', 'effect','site'],
            ).properties(
                height=alt.Step(50),
                width=alt.Step(50)
    )        
    return chart.display()

make_boxplot_entry_region(func_scores_E2,'E2')
make_boxplot_entry_region(func_scores_E3,'E3')

Make boxplot of functional domains¶

In [16]:
def make_boxplot_func_domains(df,effect):
    if effect == 'effect_E2':
        effect_name = 'EFNB2'
    else:
        effect_name = 'EFNB3'    
    
    barrel_ranges = {
        'Receptor Contact': config['contact_sites'],
        'Dimerization' : [160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 202, 203, 204, 205, 206, 208, 210, 212, 237, 238, 239, 240, 242, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331, 584, 589, 591],
        'Overall': list(range(71,602)),
    }
    custom_order=['Receptor Contact','Dimerization','Overall']
    
    mean_df = df.groupby('site')[[effect]].median().reset_index()
    
    agg_means = []
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['site'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row[effect],'site':row['site']})
    
    agg_means_df = pd.DataFrame(agg_means)
    
    # Create a box plot using Altair for aggregated means
    chart = alt.Chart(agg_means_df).mark_boxplot(extent='min-max',opacity=1).encode(
        x=alt.X('barrel:O',sort=custom_order,title='Region',axis=alt.Axis(labelAngle=-90)),
        y=alt.Y('effect',title=f'Median Cell Entry for {effect_name}', axis=alt.Axis(grid=True,tickCount=4)),
        color = alt.Color('barrel').legend(None),
        tooltip=['barrel', 'effect','site'],
        #xOffset='random:Q',
    ).properties(
        height=alt.Step(20),
        width=alt.Step(20)
    )
    return chart.display()

make_boxplot_func_domains(merged_df,'effect_E2')
make_boxplot_func_domains(merged_df,'effect_E3')

Generate Full Heatmap For EFNB2 and EFNB3¶

First, prep the data for heatmaps:

Make dataframe from entropy for heatmap

In [17]:
entropy_df = pd.read_csv(config['entropy'])
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''  
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
display(entropy_df.head(3))

entropy_sites = entropy_df[entropy_df['value'] == 0]
entropy_sites = list(entropy_sites['site'].unique())
print(entropy_sites) #These are sites conserved across all henipaviruses, used later
site value mutant wildtype effect
0 1 -0.00 entropy effect
1 2 1.75 entropy effect
2 3 1.75 entropy effect
[1, 17, 23, 38, 63, 97, 101, 104, 105, 107, 114, 119, 124, 146, 148, 151, 153, 158, 160, 162, 165, 181, 189, 203, 216, 220, 222, 229, 231, 233, 235, 240, 253, 254, 263, 282, 295, 324, 345, 355, 365, 382, 393, 395, 398, 439, 440, 454, 455, 457, 459, 460, 462, 479, 485, 486, 487, 488, 493, 494, 499, 500, 503, 506, 508, 509, 510, 524, 526, 528, 530, 532, 533, 534, 535, 540, 546, 561, 565, 566, 573, 574, 575, 579, 588, 589, 590, 598, 601]

Make dataframe for contact sites for heatmap

In [18]:
def make_contact():
    df = pd.DataFrame({
    'site': config['contact_sites'],
    'contact': [0.0] * len(config['contact_sites'])
    })
    df = df[['site','contact']]
    df['mutant'] = 'contact'
    df['wildtype'] = ''
    df['effect'] = 'effect'
    df.rename(columns={'contact':'value'}, inplace=True)
    return df
        
contact_df = make_contact()
display(contact_df.head(3))
site value mutant wildtype effect
0 239 0.0 contact effect
1 240 0.0 contact effect
2 241 0.0 contact effect

Make an empty data frame for all sites and mutations, then merge with func_scores so heatmap will have gray sites where no mutations were recorded. Finally, merge in entropy and contact for plotting

In [19]:
def make_empty_df(df):
    sites = range(71, 603)
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # Create the combination of each site with each amino acid
    data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')

    df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                     value_vars=['effect'], 
                                     var_name='effect', value_name='value')

    df_test = pd.concat([df_test,entropy_df ,contact_df],ignore_index=True) #entropy_df ,contact_df
    return df_test


empty_df_E2 = make_empty_df(func_scores_E2)
empty_df_E3 = make_empty_df(func_scores_E3)
display(empty_df_E2)
site mutant wildtype effect value
0 71 A NaN effect NaN
1 71 C Q effect -1.843
2 71 D Q effect -1.312
3 71 E Q effect -1.137
4 71 F Q effect -1.187
... ... ... ... ... ...
11270 579 contact effect 0.000
11271 580 contact effect 0.000
11272 581 contact effect 0.000
11273 583 contact effect 0.000
11274 588 contact effect 0.000

11275 rows × 5 columns

Make heatmap now with dataframe produced above. I specify range to split the heatmap into four equal rows.

In [20]:
def plot_heatmap_full_update_test(df,legend_title):
    # Create y-axis order list
    custom_order = ['contact','entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order
    final_df = df
    
    #Sites for wrapping heatmap correctly
    full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    
    # Prepare the color scales separately for entropy and effects
    color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
    color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
    color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)

    # set strokewidth to control stroke size around each individual box
    strokewidth_size = 0.25
    # container to hold the charts
    charts = [] 
    # Flags for showing the legend only the first time
    effect_legend_added = False
    entropy_legend_added = False
    for subset in full_ranges: 
        subset_df = final_df[final_df['site'].isin(subset)] #for the wrapping of sites
        unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
        # The chart for the heatmap
        base = (
            alt.Chart(subset_df)
            .encode(
                x=alt.X('site:O', title='Site', sort=sites, axis=alt.Axis(labelExpr="datum.value % 10 === 0 ? datum.value : ''",labelAngle=-90)),
                y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
                tooltip=['site','wildtype','mutant','value'],
            ).properties(
                width=alt.Step(10),
                height=alt.Step(11) #Have to make slightly taller to fit everything
            )
        )
        # Individual Heatmaps
        chart_empty = (
            base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
                (alt.datum.effect == 'effect')
            )
        )
        if not effect_legend_added:
            chart_effect = (
                base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
                color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"', 
                alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)), 
                alt.value('transparent')),
                ).transform_filter(
                    (alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
                    
                )
            )
            effect_legend_added = True
        else:
            chart_effect = (
                base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
                color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"', 
                alt.Color('value:Q', scale=color_scale_effect,legend=None), 
                alt.value('transparent')),
                ).transform_filter(
                    (alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
                    
                )
            )
        if not entropy_legend_added:
            chart_entropy = (
                base.mark_rect().encode(
                color=alt.condition(f'datum.mutant == "entropy"', 
                alt.Color('value:Q', scale=color_scale_entropy,legend=alt.Legend(title='Henipavirus Entropy')), 
                alt.value('transparent')),
                ).transform_filter(
                (alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
                    
                )
            )
            entropy_legend_added = True
        else:
            chart_entropy = (
                base.mark_rect().encode(
                color=alt.condition(f'datum.mutant == "entropy"', 
                alt.Color('value:Q', scale=color_scale_entropy,legend=None), 
                alt.value('transparent')),
                ).transform_filter(
                (alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
                    
                )
            )
        chart_contact = (
            base.mark_rect(color='black').encode(
            #color=alt.condition(f'datum.mutant == "contact"', 
            #alt.Color('value:Q', scale=color_scale_contact,legend=None), 
            #alt.value('transparent')),
            ).transform_filter(
            (alt.datum.mutant == 'contact') & (alt.datum.wildtype != None) & (alt.datum.value != None)
                
            )
        )
        # The layer for the wildtype boxes
        wildtype_layer_box = (
            alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
                x=alt.X('site:O', sort=sites),
                y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
                tooltip=['site','wildtype'],
                opacity=alt.value(1)  
            )
            .transform_filter(
                (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            )
        )
        #display(unique_wildtypes_df)
        # The layer for the wildtype amino acids
        wildtype_layer = (
            alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
                x=alt.X('site:O', sort=sites),
                y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
                tooltip=['site','wildtype'],
                opacity=alt.value(1)  
            )
            .transform_filter(
                (alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            )
        )    
        # Combine the heatmap layer with the wildtype layer
        chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_contact, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
        
        charts.append(chart)
    
    combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
    return combined_chart#.display()

E2_entry_heatmap_full = plot_heatmap_full_update_test(empty_df_E2,'CHO-EFNB2 Entry')
E2_entry_heatmap_full.save(E2_entry_heatmap)
E2_entry_heatmap_full.display()
E3_entry_heatmap_full = plot_heatmap_full_update_test(empty_df_E3,'CHO-EFNB3 Entry')
E3_entry_heatmap_full.save(E3_entry_heatmap)
E3_entry_heatmap_full.display()

Find sites that have unique entry scores by amino acid property and show with heatmap¶

In [21]:
def check_muts_by_properties(df,min_group_size,positive_value,negative_value):
    letter_to_type = {'D': 'negative', 
                      'E': 'negative', 
                      'K': 'positive', 
                      'R': 'positive', 
                      'H': 'positive', 
                      'C': 'special',
                      'S': 'hydrophilic', 
                      'T': 'hydrophilic',
                      'N': 'hydrophilic', 
                      'Q': 'hydrophilic',
                      'G': 'special',
                      'A': 'hydrophobic', 
                      'V': 'hydrophobic',
                      'L': 'hydrophobic',
                      'I': 'hydrophobic', 
                      'M': 'hydrophobic',
                      'P': 'special',
                      'F': 'aromatic', 
                      'Y': 'armomatic', 
                      'W': 'aromatic', 
                      '*': 'stop'}

    df['mutant_type'] = df['mutant'].map(letter_to_type)
    grouped = df.groupby(['site', 'mutant_type'])
    filtered_groups = grouped.filter(lambda x: (x['effect'] > positive_value).all() and len(x) >= min_group_size) 
    #Positive Mask
    positive_mask = (
        filtered_groups.groupby('site')['effect'].transform(lambda x: (x > positive_value).all()) &
        df.groupby('site')['effect'].transform(lambda x: (x <= positive_value).any())
    )
    positive_sites = df[positive_mask]

    #Negative mask
    filtered_groups_neg = grouped.filter(lambda x: (x['effect'] < negative_value).all() and len(x) >= min_group_size)
    negative_mask = (
        filtered_groups_neg.groupby('site')['effect'].transform(lambda x: (x < negative_value).all()) &
        df.groupby('site')['effect'].transform(lambda x: (x <= negative_value).any())
    )
    negative_sites = df[negative_mask]
    
    common_sites = pd.merge(positive_sites, negative_sites, on='site', how='inner')
    final_sites = common_sites['site'].drop_duplicates()
    return final_sites


final_sites_E2 = check_muts_by_properties(func_scores_E2,2,-0.25,-2)
final_sites_E3 = check_muts_by_properties(func_scores_E3,2,-0.25,-2)
print(list(final_sites_E2))
print(list(final_sites_E3))
[76, 78, 93, 103, 136, 139, 141, 144, 145, 154, 156, 164, 228, 230, 232, 237, 244, 247, 248, 250, 268, 283, 284, 285, 286, 294, 295, 296, 316, 349, 350, 400, 414, 416, 426, 431, 438, 465, 469, 475, 477, 482, 509, 511, 521, 531, 547, 576]
[79, 101, 118, 119, 141, 150, 154, 155, 228, 231, 235, 241, 250, 256, 283, 284, 285, 286, 291, 294, 296, 298, 305, 316, 319, 348, 350, 354, 408, 411, 414, 416, 431, 437, 442, 443, 444, 465, 469, 475, 477, 482, 485, 491, 492, 511, 514, 520, 521, 530, 546, 547, 551, 552, 559, 575, 576, 577, 578, 591]
In [22]:
def plot_heatmap_full_update_test(df,sites_,legend_title):
    # Create y-axis order list
    custom_order = ['contact','entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order
    final_df = df
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    
    # Prepare the color scales separately for entropy and effects
    color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
    color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
    color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
    strokewidth_size = 0.25
    
    # container to hold the charts
    charts = [] 
    
    #for subset in full_ranges: 
    subset_df = final_df[final_df['site'].isin(sites_)] #for the wrapping of sites
    unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
    
    # The chart for the heatmap
    base = (
        alt.Chart(subset_df)
        .encode(
            x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90)),
            y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype','mutant','value'],
        ).properties(
            width=alt.Step(10),
            height=alt.Step(11) 
        )
    )
    # Individual Heatmaps
    chart_empty = (
        base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
            (alt.datum.effect == 'effect')
        )
    )
    chart_effect = (
        base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
        color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"', 
        alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)), 
        alt.value('transparent')),
        ).transform_filter(
            (alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    chart_entropy = (
        base.mark_rect().encode(
        color=alt.condition(f'datum.mutant == "entropy"', 
        alt.Color('value:Q', scale=color_scale_entropy,legend=None), 
        alt.value('transparent')),
        ).transform_filter(
        (alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    chart_contact = (
        base.mark_rect(color='black').encode(
        #color=alt.condition(f'datum.mutant == "contact"', 
        #alt.Color('value:Q', scale=color_scale_contact,legend=None), 
        #alt.value('transparent')),
        ).transform_filter(
        (alt.datum.mutant == 'contact') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    # The layer for the wildtype boxes
    wildtype_layer_box = (
        alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )
    # The layer for the wildtype amino acids
    wildtype_layer = (
        alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )    
    # Combine the heatmap layer with the wildtype layer
    chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_contact, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
    
    charts.append(chart)
    
    combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
    return combined_chart.display()

plot_heatmap_full_update_test(empty_df_E2,final_sites_E2,'EFNB2 Entry')
plot_heatmap_full_update_test(empty_df_E3,final_sites_E3,'EFNB3 Entry')
In [23]:
### Show heatmap of different wildtype amino acid classes
In [24]:
hydrophobic_AA = ['A','V','L','I','M']
aromatic_AA = ['Y','W','F']
positive_AA = ['K','R','H']
negative_AA = ['E','D']
hydrophilic_AA = ['S','T','N','Q']

def find_aa_wildtype_sites(df,aa_type):
    aa_list = list(df[df['wildtype'].isin(aa_type)]['site'].unique())
    return aa_list

# Find sites where the WT are different classes of amino acids
hydrophobic_AA_list = find_aa_wildtype_sites(func_scores_E3,hydrophobic_AA)
aromatic_AA_list = find_aa_wildtype_sites(func_scores_E3,aromatic_AA)
positive_AA_list = find_aa_wildtype_sites(func_scores_E3,positive_AA)
negative_AA_list = find_aa_wildtype_sites(func_scores_E3,negative_AA)
hydrophilic_AA_list = find_aa_wildtype_sites(func_scores_E3,hydrophilic_AA)

all_AA = [hydrophobic_AA_list,aromatic_AA_list,positive_AA_list,negative_AA_list,hydrophilic_AA_list]

for i in all_AA:
    plot_heatmap_full_update_test(empty_df_E2,i,'EFNB2 Entry')
    plot_heatmap_full_update_test(empty_df_E3,i,'EFNB3 Entry')

Show heatmap of sites that conserved in all henipaviruses¶

In [25]:
plot_heatmap_full_update_test(empty_df_E2,entropy_sites,'EFNB2 Entry')
plot_heatmap_full_update_test(empty_df_E3,entropy_sites,'EFNB3 Entry')

Show heatmap of sites that only have deleterious mutations¶

In [26]:
plot_heatmap_full_update_test(empty_df_E2,intolerant_sites_E2,'EFNB2 entry')
plot_heatmap_full_update_test(empty_df_E3,intolerant_sites_E3,'EFNB3 entry')

Make heatmap of contact sites¶

In [27]:
def make_df(list_, name_):
    #print(name_)  # This will print the actual value of name_
    df = pd.DataFrame({
        'site': list_,
        name_: [0.0] * len(list_)
    })
    df = df[['site', name_]]  # Use name_ directly if it's a string
    df['mutant'] = name_  # Use name_ directly if it's a string
    df['wildtype'] = ''
    df['effect'] = 'effect'
    df.rename(columns={name_: 'value'}, inplace=True)  # Use name_ directly if it's a string
    #display(df)
    return df

# Example calls to the function
hydrophobic_df = make_df(config['hydrophobic'], 'hydrophobic')
hydrogen_df = make_df(config['h_bond_total'], 'hydrogen bonds')
salt_bridge_df = make_df(config['salt_bridges'], 'salt bridges')

def make_empty_df_contact(df):
    sites = range(71, 603)
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # Create the combination of each site with each amino acid
    data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
    


    df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                     value_vars=['effect'], 
                                     var_name='effect', value_name='value')

    df_test = pd.concat([df_test,hydrophobic_df,hydrogen_df,salt_bridge_df],ignore_index=True)
    return df_test


empty_df_E2 = make_empty_df_contact(func_scores_E2)
empty_df_E3 = make_empty_df_contact(func_scores_E3)


def plot_heatmap_full_contact_site(df,legend_title):
    # Create y-axis order list
    custom_order = ['salt bridges','hydrogen bonds','hydrophobic','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order
    final_df = df

    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    
    # Prepare the color scales separately for entropy and effects
    color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
    color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
    color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
    strokewidth_size = 0.25
    # container to hold the charts
    charts = [] 
    #for subset in full_ranges: 
    subset_df = final_df[final_df['site'].isin(config['contact_sites'])] #for the wrapping of sites
    unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
    # The chart for the heatmap
    base = (
        alt.Chart(subset_df)
        .encode(
            x=alt.X('site:O', title='Site', sort=sites, axis=alt.Axis(labelAngle=-90)),
            y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype','mutant','value'],
        ).properties(
            width=alt.Step(10),
            height=alt.Step(10) 
        )
    )
    # Individual Heatmaps
    chart_empty = (
        base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
            (alt.datum.effect == 'effect')
        )
    )
    chart_effect = (
        base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
        color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"', 
        alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)), 
        alt.value('transparent')),
        ).transform_filter(
            (alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    chart_entropy = (
        base.mark_rect().encode(
        color=alt.condition(f'datum.mutant == "entropy"', 
        alt.Color('value:Q', scale=color_scale_entropy,legend=None), 
        alt.value('transparent')),
        ).transform_filter(
        (alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    chart_other = (
        base.mark_rect(color='black').encode(
        ).transform_filter(
        (alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridges') | (alt.datum.mutant == 'hydrogen bonds')
            
        )
    )
    # The layer for the wildtype boxes
    wildtype_layer_box = (
        alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )
    # The layer for the wildtype amino acids
    wildtype_layer = (
        alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )    
    # Combine the heatmap layer with the wildtype layer
    chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_other, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
    
    charts.append(chart)
    
    combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
    return combined_chart.display()

plot_heatmap_full_contact_site(empty_df_E2,'EFNB2 Entry')
plot_heatmap_full_contact_site(empty_df_E3,'EFNB3 Entry')

Plot heatmap of cysteine and n-linked glycosylation motifs¶

In [28]:
def make_empty_df(df):
    sites = range(71, 603)
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    # Create the combination of each site with each amino acid
    data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
    


    df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
                                     value_vars=['effect'], 
                                     var_name='effect', value_name='value')

    df_test = pd.concat([entropy_df,df_test],ignore_index=True)
    return df_test


empty_df_E2 = make_empty_df(func_scores_E2)
empty_df_E3 = make_empty_df(func_scores_E3)


def plot_heatmap_glycan_cysteine(df,sites,legend_title):
    # Create y-axis order list
    custom_order = ['entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C']    #custom_order = names + custom_order
    final_df = df[df['site'].isin(sites)]
    
    # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    final_df = final_df.sort_values('site')
    sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
    final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
    # Now sort the dataframe by this rank
    final_df = final_df.sort_values('mutant_rank')
    # Drop the 'mutant_rank' column as it is no longer needed after sorting
    final_df = final_df.drop(columns=['mutant_rank'])
    sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
    
    # Prepare the color scales separately for entropy and effects
    color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
    color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
    color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
    strokewidth_size = 0.25
    # container to hold the charts
    charts = [] 
    #for subset in full_ranges: 
    subset_df = final_df[final_df['site'].isin(sites)] #for the wrapping of sites
    unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
    # The chart for the heatmap
    base = (
        alt.Chart(subset_df)
        .encode(
            x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90)),
            y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype','mutant','value'],
        ).properties(
            width=alt.Step(10),
            height=alt.Step(11) 
        )
    )
    # Individual Heatmaps
    chart_empty = (
        base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
            (alt.datum.effect == 'effect')
        )
    )
    chart_effect = (
        base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
        color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"', 
        alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)), 
        alt.value('transparent')),
        ).transform_filter(
            (alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    chart_entropy = (
        base.mark_rect().encode(
        color=alt.condition(f'datum.mutant == "entropy"', 
        alt.Color('value:Q', scale=color_scale_entropy,legend=None), 
        alt.value('transparent')),
        ).transform_filter(
        (alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
            
        )
    )
    # The layer for the wildtype boxes
    wildtype_layer_box = (
        alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )
    #display(unique_wildtypes_df)
    # The layer for the wildtype amino acids
    wildtype_layer = (
        alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
            x=alt.X('site:O', sort=sites),
            y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
            tooltip=['site','wildtype'],
            opacity=alt.value(1)  
        )
        .transform_filter(
            (alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
        )
    )    
    # Combine the heatmap layer with the wildtype layer
    chart = alt.layer(chart_empty, chart_effect, chart_entropy, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
    
    charts.append(chart)
    
    combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
    return combined_chart.display()

cysteine_neck = [146, 158, 162]
cysteine_1 = [189,601]
cysteine_2 = [216, 240]
cysteine_3 = [282,295]
cysteine_4 = [382, 395]
cysteine_5 = [387, 499]
cysteine_6 = [493, 503]
cysteine_7 = [565, 574]

cysteine = cysteine_neck + cysteine_1 + cysteine_2 + cysteine_3 + cysteine_4 + cysteine_5 + cysteine_6 + cysteine_7

n_linked = config['glycans']
stalk = list(range(96, 147))
neck = list(range(148,166))
linker = list(range(166,177))

for sites in [cysteine,n_linked,stalk,neck,linker]:
    plot_heatmap_glycan_cysteine(empty_df_E2,sites,'EFNB2 Entry')
    plot_heatmap_glycan_cysteine(empty_df_E3,sites,'EFNB3 Entry')

Check for potential neutral/beneficial glycosylation sites¶

In [29]:
def find_potential_glycan_sites(df):
    filtered = df[df['wildtype'].isin(['T', 'S'])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[(df['site'] == row['site'] - 2) & (df['mutant'] == 'N') & (df['effect'] > 0)]
        if not prior_rows.empty:
            matching_sites.append(row['site'])
    unique_list1 = list(set(matching_sites))
    unique_list1 = [x - 2 for x in unique_list1]

    filtered = df[df['wildtype'].isin(['N'])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[(df['site'] == row['site'] + 2) & (df['mutant'].isin(['T','S'])) & (df['effect'] > 0)]
        if not prior_rows.empty:
            matching_sites.append(row['site'])  
    unique_list2 = list(set(matching_sites))
    unique_list = unique_list1 + unique_list2
    unique_list.sort()
    print(f'The total number of potential PNLG sites are: {len(unique_list)}')
    return unique_list

PNLG = find_potential_glycan_sites(func_scores_E3)

surface = pd.read_csv(config['surface'])
surface.rename(columns={'exposure_A': 'Surface Exposure'},inplace=True)
PNLG_SURFACE = surface[surface['site'].isin(PNLG)]
PNLG_SURFACE = list(PNLG_SURFACE[PNLG_SURFACE['Surface Exposure'] >= 25]['site'].unique())

print(f'The surface exposed PNLG sites are: {PNLG_SURFACE}')

glycans = config['glycans']
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]

print(filtered_PNLG_SURFACE)

print(len(filtered_PNLG_SURFACE))
The total number of potential PNLG sites are: 33
The surface exposed PNLG sites are: [159, 180, 191, 192, 275, 288, 306, 311, 326, 378, 383, 403, 417, 423, 473, 478, 518, 543, 554, 570, 600]
[180, 191, 192, 275, 288, 311, 326, 383, 403, 423, 473, 478, 518, 543, 554, 570, 600]
17
In [ ]: